/*
* Copyright (C) 2014 Jörg Prante
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program; if not, see http://www.gnu.org/licenses
* or write to the Free Software Foundation, Inc., 51 Franklin Street,
* Fifth Floor, Boston, MA 02110-1301 USA.
*
* The interactive user interfaces in modified source and object code
* versions of this program must display Appropriate Legal Notices,
* as required under Section 5 of the GNU Affero General Public License.
*
*/
package org.xbib.elasticsearch.index.analysis.langdetect;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.index.fielddata.FieldDataType;
import org.elasticsearch.index.mapper.FieldMapperListener;
import org.elasticsearch.index.mapper.Mapper;
import org.elasticsearch.index.mapper.MapperParsingException;
import org.elasticsearch.index.mapper.MergeContext;
import org.elasticsearch.index.mapper.MergeMappingException;
import org.elasticsearch.index.mapper.ObjectMapperListener;
import org.elasticsearch.index.mapper.ParseContext;
import org.elasticsearch.index.mapper.core.AbstractFieldMapper;
import org.elasticsearch.index.mapper.core.StringFieldMapper;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.List;
import java.util.Map;
import static org.elasticsearch.index.mapper.MapperBuilders.stringField;
/**
 * Field mapper for the {@code langdetect} mapping type.
 *
 * <p>When a string value is parsed, it is handed to a content sub-mapper and then run
 * through a {@link LangdetectService}; every detected language code (or the literal
 * {@code "unknown"} when detection fails) is handed to a second sub-mapper named
 * {@code lang}. Both sub-mappers are plain {@link StringFieldMapper}s.
 */
public class LangdetectMapper extends AbstractFieldMapper<Object> {

    /** Type name written by {@link #toXContent} and returned by {@link #contentType()}. */
    public static final String CONTENT_TYPE = "langdetect";

    /** Charset used to decode binary (base64) field values; resolved once instead of per document. */
    private static final Charset UTF8 = Charset.forName("UTF-8");

    /**
     * Fluent builder that collects the two sub-mapper builders and the detector settings.
     */
    public static class Builder extends AbstractFieldMapper.Builder<Builder, LangdetectMapper> {

        private StringFieldMapper.Builder contentBuilder;

        private StringFieldMapper.Builder langBuilder;

        private final ImmutableSettings.Builder settingsBuilder;

        /**
         * Creates a builder with default sub-mappers: a string field named like this
         * field for the content and a string field named {@code lang} for the languages.
         *
         * @param name the field name
         */
        public Builder(String name) {
            super(name, new FieldType(Defaults.FIELD_TYPE));
            this.builder = this;
            this.contentBuilder = stringField(name);
            this.langBuilder = stringField("lang");
            this.settingsBuilder = ImmutableSettings.settingsBuilder();
        }

        /** Replaces the builder of the content sub-mapper. */
        public Builder content(StringFieldMapper.Builder content) {
            this.contentBuilder = content;
            return this;
        }

        /** Replaces the builder of the {@code lang} sub-mapper. */
        public Builder lang(StringFieldMapper.Builder lang) {
            this.langBuilder = lang;
            return this;
        }

        /** Stores the detector setting {@code number_of_trials}. */
        public Builder ntrials(int trials) {
            settingsBuilder.put("number_of_trials", trials);
            return this;
        }

        /** Stores the detector setting {@code alpha}. */
        public Builder alpha(double alpha) {
            settingsBuilder.put("alpha", alpha);
            return this;
        }

        /** Stores the detector setting {@code alpha_width}. */
        public Builder alphaWidth(double alphaWidth) {
            settingsBuilder.put("alpha_width", alphaWidth);
            return this;
        }

        /** Stores the detector setting {@code iteration_limit}. */
        public Builder iterationLimit(int iterationLimit) {
            settingsBuilder.put("iteration_limit", iterationLimit);
            return this;
        }

        /** Stores the detector setting {@code prob_threshold}. */
        public Builder probThreshold(double probThreshold) {
            settingsBuilder.put("prob_threshold", probThreshold);
            return this;
        }

        /** Stores the detector setting {@code conv_threshold}. */
        public Builder convThreshold(double convThreshold) {
            settingsBuilder.put("conv_threshold", convThreshold);
            return this;
        }

        /** Stores the detector setting {@code base_freq}. */
        public Builder baseFreq(int baseFreq) {
            settingsBuilder.put("base_freq", baseFreq);
            return this;
        }

        /** Stores the detector setting {@code pattern}. */
        public Builder pattern(String pattern) {
            settingsBuilder.put("pattern", pattern);
            return this;
        }

        /** Stores the detector setting {@code max}. */
        public Builder max(int max) {
            settingsBuilder.put("max", max);
            return this;
        }

        /**
         * Stores the setting {@code binary}. When true, {@link LangdetectMapper#parse}
         * tries to read the field value as binary and decode it as UTF-8 text.
         */
        public Builder binary(boolean binary) {
            settingsBuilder.put("binary", binary);
            return this;
        }

        /**
         * Stores every entry of the given map as a setting {@code map.<key>}.
         *
         * @param map entries to store; must not be null
         */
        public Builder map(Map<String,String> map) {
            for (Map.Entry<String, String> entry : map.entrySet()) {
                settingsBuilder.put("map." + entry.getKey(), entry.getValue());
            }
            return this;
        }

        /**
         * Stores the given list as the array setting {@code languages}.
         *
         * @param languages values to store; must not be null
         */
        public Builder languages(List<String> languages) {
            settingsBuilder.putArray("languages", languages.toArray(new String[languages.size()]));
            return this;
        }

        /**
         * Builds both sub-mappers below this field's path, creates and starts the
         * detector from the collected settings, and assembles the mapper.
         */
        @Override
        public LangdetectMapper build(BuilderContext context) {
            final StringFieldMapper contentMapper;
            final StringFieldMapper langMapper;
            context.path().add(name);
            try {
                contentMapper = contentBuilder.build(context);
                langMapper = langBuilder.build(context);
            } finally {
                // Always pop the path element pushed above, even if a sub-builder fails.
                context.path().remove();
            }
            LangdetectService detector = new LangdetectService(settingsBuilder.build());
            detector.start();
            return new LangdetectMapper(new Names(name), contentMapper, langMapper, detector);
        }
    }

    /**
     * Turns the mapping definition of a {@code langdetect} field into a {@link Builder}.
     * Keys that are not handled by the switch below are ignored.
     */
    public static class TypeParser implements Mapper.TypeParser {

        /**
         * Reads the sub-field definitions under {@code fields} and the detector settings.
         *
         * @throws MapperParsingException if a numeric or boolean setting has a value
         *         that cannot be interpreted as such
         */
        @SuppressWarnings({"unchecked", "rawtypes"})
        @Override
        public Mapper.Builder parse(String name, Map<String, Object> node, ParserContext parserContext)
                throws MapperParsingException {
            LangdetectMapper.Builder builder = new Builder(name);
            for (Map.Entry<String, Object> entry : node.entrySet()) {
                String fieldName = entry.getKey();
                Object fieldNode = entry.getValue();
                switch (fieldName) {
                    case "fields": {
                        Map<String, Object> fieldsNode = (Map<String, Object>) fieldNode;
                        for (Map.Entry<String, Object> fieldsEntry : fieldsNode.entrySet()) {
                            String propName = fieldsEntry.getKey();
                            Object propNode = fieldsEntry.getValue();
                            if (name.equals(propName)) {
                                // Sub-field named like the field itself configures the content mapper.
                                builder.content((StringFieldMapper.Builder) parserContext.typeParser("string")
                                        .parse(name, (Map<String, Object>) propNode, parserContext));
                            } else if ("lang".equals(propName)) {
                                builder.lang((StringFieldMapper.Builder) parserContext.typeParser("string")
                                        .parse("lang", (Map<String, Object>) propNode, parserContext));
                            }
                        }
                        break;
                    }
                    case "number_of_trials": {
                        builder.ntrials(intValue(fieldName, fieldNode));
                        break;
                    }
                    case "alpha": {
                        builder.alpha(doubleValue(fieldName, fieldNode));
                        break;
                    }
                    case "alpha_width": {
                        builder.alphaWidth(doubleValue(fieldName, fieldNode));
                        break;
                    }
                    case "iteration_limit": {
                        builder.iterationLimit(intValue(fieldName, fieldNode));
                        break;
                    }
                    case "prob_threshold": {
                        builder.probThreshold(doubleValue(fieldName, fieldNode));
                        break;
                    }
                    case "conv_threshold": {
                        builder.convThreshold(doubleValue(fieldName, fieldNode));
                        break;
                    }
                    case "base_freq": {
                        builder.baseFreq(intValue(fieldName, fieldNode));
                        break;
                    }
                    case "pattern": {
                        builder.pattern((String) fieldNode);
                        break;
                    }
                    case "max": {
                        builder.max(intValue(fieldName, fieldNode));
                        break;
                    }
                    case "binary": {
                        builder.binary(booleanValue(fieldName, fieldNode));
                        break;
                    }
                    case "map": {
                        builder.map((Map<String,String>) fieldNode);
                        break;
                    }
                    case "languages": {
                        builder.languages((List<String>) fieldNode);
                        break;
                    }
                }
            }
            return builder;
        }

        /**
         * Coerces a mapping value to {@code int}. Accepts any {@link Number} (so a value
         * delivered as Long or Double no longer fails with a ClassCastException) and
         * numeric strings.
         */
        private static int intValue(String fieldName, Object node) {
            if (node instanceof Number) {
                return ((Number) node).intValue();
            }
            if (node instanceof String) {
                try {
                    return Integer.parseInt(((String) node).trim());
                } catch (NumberFormatException e) {
                    throw new MapperParsingException("[" + fieldName + "] must be an integer, but was ["
                            + node + "]", e);
                }
            }
            throw new MapperParsingException("[" + fieldName + "] must be an integer, but was [" + node + "]");
        }

        /**
         * Coerces a mapping value to {@code double}. Accepts any {@link Number} (so an
         * integral value such as {@code 1} no longer fails with a ClassCastException)
         * and numeric strings.
         */
        private static double doubleValue(String fieldName, Object node) {
            if (node instanceof Number) {
                return ((Number) node).doubleValue();
            }
            if (node instanceof String) {
                try {
                    return Double.parseDouble(((String) node).trim());
                } catch (NumberFormatException e) {
                    throw new MapperParsingException("[" + fieldName + "] must be a number, but was ["
                            + node + "]", e);
                }
            }
            throw new MapperParsingException("[" + fieldName + "] must be a number, but was [" + node + "]");
        }

        /**
         * Coerces a mapping value to {@code boolean}. Accepts {@link Boolean} values and
         * the strings {@code "true"} / {@code "false"} (case-insensitive); anything else
         * is rejected rather than guessed.
         */
        private static boolean booleanValue(String fieldName, Object node) {
            if (node instanceof Boolean) {
                return (Boolean) node;
            }
            if (node instanceof String) {
                String text = ((String) node).trim();
                if ("true".equalsIgnoreCase(text)) {
                    return true;
                }
                if ("false".equalsIgnoreCase(text)) {
                    return false;
                }
            }
            throw new MapperParsingException("[" + fieldName + "] must be a boolean, but was [" + node + "]");
        }
    }

    private final StringFieldMapper contentMapper;

    private final StringFieldMapper langMapper;

    private final LangdetectService detector;

    /**
     * @param names         the names of this field
     * @param contentMapper sub-mapper that receives the field's text
     * @param langMapper    sub-mapper that receives each detected language code
     * @param detector      service used to detect the languages of the text
     */
    public LangdetectMapper(Names names, StringFieldMapper contentMapper, StringFieldMapper langMapper,
                            LangdetectService detector) {
        super(names, 1.0f, Defaults.FIELD_TYPE, false, null, null, null, null, null, null, null, null, null, null);
        this.contentMapper = contentMapper;
        this.langMapper = langMapper;
        this.detector = detector;
    }

    @Override
    public FieldType defaultFieldType() {
        return Defaults.FIELD_TYPE;
    }

    /** This mapper declares no field data type of its own. */
    @Override
    public FieldDataType defaultFieldDataType() {
        return null;
    }

    /** Always returns {@code null}; values are produced by the sub-mappers. */
    @Override
    public Object value(Object value) {
        return null;
    }

    /**
     * Parses the current string token: passes the text to the content sub-mapper, then
     * passes each detected language code to the {@code lang} sub-mapper. If detection
     * throws a {@link LanguageDetectionException}, the value {@code "unknown"} is passed
     * instead. Tokens that are not strings are skipped.
     */
    @Override
    public void parse(ParseContext context) throws IOException {
        String content = readContent(context.parser());
        if (content == null) {
            return;
        }
        contentMapper.parse(context.createExternalValueContext(content));
        try {
            List<Language> langs = detector.detectAll(content);
            for (Language lang : langs) {
                langMapper.parse(context.createExternalValueContext(lang.getLanguage()));
            }
        } catch (LanguageDetectionException e) {
            langMapper.parse(context.createExternalValueContext("unknown"));
        }
    }

    /**
     * Returns the text of the current token, or {@code null} if it is not a string.
     * With the {@code binary} setting enabled, the value is additionally read as binary
     * and, if that yields at least one byte, decoded as UTF-8.
     */
    private String readContent(XContentParser parser) throws IOException {
        if (parser.currentToken() != XContentParser.Token.VALUE_STRING) {
            return null;
        }
        String content = parser.text();
        if (detector.getSettings().getAsBoolean("binary", false)) {
            try {
                byte[] decoded = parser.binaryValue();
                if (decoded != null && decoded.length > 0) {
                    content = new String(decoded, UTF8);
                }
            } catch (Exception ignored) {
                // Deliberate best effort: a value that cannot be read as binary is
                // indexed as the plain text obtained above.
            }
        }
        return content;
    }

    /** Intentionally empty: {@link #parse(ParseContext)} delegates to the sub-mappers. */
    @Override
    protected void parseCreateField(ParseContext context, List<Field> fields) throws IOException {
    }

    /**
     * No-op.
     * NOTE(review): changed detector settings in an updated mapping are neither merged
     * nor reported as a conflict here - confirm this is intended.
     */
    @Override
    public void merge(Mapper mergeWith, MergeContext mergeContext) throws MergeMappingException {
    }

    /** Visits both sub-mappers; this mapper itself is not passed to the listener. */
    @Override
    public void traverse(FieldMapperListener fieldMapperListener) {
        contentMapper.traverse(fieldMapperListener);
        langMapper.traverse(fieldMapperListener);
    }

    @Override
    public void traverse(ObjectMapperListener objectMapperListener) {
    }

    /**
     * Closes both sub-mappers.
     * NOTE(review): the detector started in {@link Builder#build} is not stopped here -
     * verify whether LangdetectService needs an explicit shutdown.
     */
    @Override
    public void close() {
        contentMapper.close();
        langMapper.close();
    }

    /**
     * Writes the field name, the type and both sub-mappers under {@code fields}.
     * NOTE(review): the detector settings (number_of_trials, alpha, languages, ...) are
     * not written, so they presumably do not survive a mapping round trip - confirm.
     */
    @Override
    public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
        builder.startObject(name());
        builder.field("type", CONTENT_TYPE);
        builder.startObject("fields");
        contentMapper.toXContent(builder, params);
        langMapper.toXContent(builder, params);
        builder.endObject();
        builder.endObject();
        return builder;
    }

    @Override
    protected String contentType() {
        return CONTENT_TYPE;
    }
}